plot_ly() and ggplotly().plot_geo().We will work with two Starbucks datasets, one on the store locations (global) and one for the nutritional data for their food and drink items. We will do some text analysis of the menu items.
Upload an html file to Quercus and make sure the figures remain interactive.
sb_locs <- read_csv("https://raw.githubusercontent.com/JSC370/JSC370-2025/refs/heads/main/data/starbucks/starbucks-locations.csv")
## Rows: 25600 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Store Number, Store Name, Ownership Type, Street Address, City, St...
## dbl (2): Longitude, Latitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sb_nutr <- read_csv("https://raw.githubusercontent.com/JSC370/JSC370-2025/refs/heads/main/data/starbucks/starbucks-menu-nutrition.csv")
## Rows: 205 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Item, Category
## dbl (5): Calories, Fat (g), Carb. (g), Fiber (g), Protein (g)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usa_pop <- read_csv("https://raw.githubusercontent.com/JSC370/JSC370-2025/refs/heads/main/data/starbucks/us_state_pop.csv")
## Rows: 55 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): state
## dbl (1): population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usa_states<-read_csv("https://raw.githubusercontent.com/JSC370/JSC370-2025/refs/heads/main/data/starbucks/states.csv")
## Rows: 51 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): State, Abbreviation
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sb_locs
## # A tibble: 25,600 × 12
## `Store Number` `Store Name` `Ownership Type` `Street Address` City
## <chr> <chr> <chr> <chr> <chr>
## 1 47370-257954 Meritxell, 96 Licensed Av. Meritxell, … Ando…
## 2 22331-212325 Ajman Drive Thru Licensed 1 Street 69, Al… Ajman
## 3 47089-256771 Dana Mall Licensed Sheikh Khalifa … Ajman
## 4 22126-218024 Twofour 54 Licensed Al Salam Street Abu …
## 5 17127-178586 Al Ain Tower Licensed Khaldiya Area, … Abu …
## 6 17688-182164 Dalma Mall, Ground Fl… Licensed Dalma Mall, Mus… Abu …
## 7 18182-182165 Dalma Mall, Level 1 Licensed Dalma Mall, Mus… Abu …
## 8 23359-229184 Debenhams Yas Mall Licensed Yas Island Abu …
## 9 30781-99022 Khalidiya Street Licensed Khalidiya St. Abu …
## 10 20423-205465 Eastern Mangroves Licensed Al Salam Street… Abu …
## # ℹ 25,590 more rows
## # ℹ 7 more variables: `State/Province` <chr>, Country <chr>, Postcode <chr>,
## # `Phone Number` <chr>, Timezone <chr>, Longitude <dbl>, Latitude <dbl>
sb_nutr
## # A tibble: 205 × 7
## Item Category Calories `Fat (g)` `Carb. (g)` `Fiber (g)` `Protein (g)`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Chonga Bag… Food 300 5 50 3 12
## 2 8-Grain Ro… Food 380 6 70 7 10
## 3 Almond Cro… Food 410 22 45 3 10
## 4 Apple Frit… Food 460 23 56 2 7
## 5 Banana Nut… Food 420 22 52 2 6
## 6 Blueberry … Food 380 16 53 1 6
## 7 Blueberry … Food 420 17 61 2 5
## 8 Butter Cro… Food 240 12 28 1 5
## 9 Butterfly … Food 350 22 38 0 2
## 10 Cheese Dan… Food 320 16 36 1 8
## # ℹ 195 more rows
usa_pop
## # A tibble: 55 × 2
## state population
## <chr> <dbl>
## 1 Alabama 4779736
## 2 Alaska 710231
## 3 Arizona 6392017
## 4 Arkansas 2915918
## 5 California 37253956
## 6 Colorado 5029196
## 7 Connecticut 3574097
## 8 Delaware 897934
## 9 District of Columbia 601723
## 10 Florida 18801310
## # ℹ 45 more rows
usa_states
## # A tibble: 51 × 2
## State Abbreviation
## <chr> <chr>
## 1 Alabama AL
## 2 Alaska AK
## 3 Arizona AZ
## 4 Arkansas AR
## 5 California CA
## 6 Colorado CO
## 7 Connecticut CT
## 8 Delaware DE
## 9 District of Columbia DC
## 10 Florida FL
## # ℹ 41 more rows
# Subset Starbucks locations to only those in the US
sb_usa <- sb_locs |> filter(Country == "US")
# Count Starbucks stores by state
sb_locs_state <- sb_usa |>
group_by(`State/Province`) |>
summarize(store_count = n())
# Rename the state abbreviation column in the states dataset
usa_states <- usa_states |> rename(state = State, abbreviation = Abbreviation)
# Merge state population data with state abbreviations
usa_pop_abbr <- full_join(usa_pop, usa_states, by = c("state" = "state"))
# Merge the Starbucks store count data with the population dataset
sb_locs_state <- full_join(sb_locs_state, usa_pop_abbr,
by = c("State/Province" = "abbreviation"))
# Inspect the range of values
summary(sb_locs_state)
## State/Province store_count state population
## Length:55 Min. : 8.0 Length:55 Min. : 56882
## Class :character 1st Qu.: 56.5 Class :character 1st Qu.: 1344331
## Mode :character Median : 123.0 Mode :character Median : 3751351
## Mean : 266.8 Mean : 5677621
## 3rd Qu.: 332.0 3rd Qu.: 6515716
## Max. :2821.0 Max. :37253956
## NA's :4
ggplotly for EDAAnswer the following questions:
Are the number of Starbucks proportional to the population of a state? (scatterplot)
Is the caloric distribution of Starbucks menu items different for drinks and food? (histogram)
What are the top 20 words in Starbucks menu items? (bar plot)
# Scatterplot of Starbucks stores vs. population
p1 <- sb_locs_state |>
ggplot(aes(x = population, y = store_count, text = state)) +
geom_point(color = "darkgreen", alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Starbucks Stores vs. State Population",
x = "Population",
y = "Number of Starbucks Stores") +
theme_minimal()
ggplotly(p1, tooltip = "text")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).
# Ensure there's a category column to distinguish drinks vs. food
p2 <- sb_nutr |>
ggplot(aes(x = Calories, fill = Category)) +
geom_histogram(bins = 30, position = "identity", alpha = 0.6) +
labs(title = "Caloric Distribution: Drinks vs. Food",
x = "Calories",
y = "Count",
fill = "Category") +
theme_minimal()
ggplotly(p2)
# Tokenizing menu items into words
top_words <- sb_nutr |>
unnest_tokens(word, `Item`) |> # Assuming 'Beverage_category' contains item names
anti_join(stop_words, by = "word") |> # Remove common stopwords
count(word, sort = TRUE) |>
slice_max(n, n = 20)
# Bar plot of top 20 words
p3 <- top_words |>
ggplot(aes(x = reorder(word, n), y = n, fill = word)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(title = "Top 20 Words in Starbucks Menu Items",
x = "Word",
y = "Count") +
theme_minimal()
ggplotly(p3)
plot_ly()plot_ly() representing the
relationship between calories and carbs. Color the points by category
(food or beverage). Is there a relationship, and do food or beverages
tend to have more calories?# Scatterplot of Calories vs. Carbs using plot_ly
plot_ly(data = sb_nutr,
x = ~`Carb. (g)`,
y = ~Calories,
color = ~Category, # Coloring by food or beverage category
type = "scatter",
mode = "markers",
marker = list(size = 8, opacity = 0.6)) %>%
layout(title = "Calories vs. Carbohydrates in Starbucks Menu Items",
xaxis = list(title = "Carbohydrates (g)"),
yaxis = list(title = "Calories"),
legend = list(title = list(text = "Category")))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
5a) Answer: There seems to be a positive correlation betweenc carbs and calories. Food, on average, has higher calories than drinks.
Repeat this scatterplot but for the items that include the top 10
words. Color again by category, and add hoverinfo specifying the word in
the item name. Add layout information to title the chart and the axes,
and enable hovermode = "compare".
What are the top 10 words and is the plot much different than above?
# Tokenizing menu items into words and counting occurrences
top_10_words <- sb_nutr |>
unnest_tokens(word, `Item`) |> # Assuming 'Beverage_category' contains item names
anti_join(stop_words, by = "word") |> # Remove common stopwords
count(word, sort = TRUE) |>
slice_max(n, n = 10) |>
pull(word) # Extract as a vector
# Create a regex pattern to match any of the top 10 words
pattern <- paste0("\\b(", paste(top_10_words, collapse = "|"), ")\\b")
# Filter for menu items that contain at least one of the top 10 words
sb_nutr_filtered <- sb_nutr |>
filter(str_detect(`Item`, regex(pattern, ignore_case = TRUE))) |>
mutate(matching_word = str_extract(`Item`, regex(pattern, ignore_case = TRUE)))
plot_ly(data = sb_nutr_filtered,
x = ~`Carb. (g)`,
y = ~Calories,
color = ~Category, # Coloring by food or beverage category
text = ~paste("Word:", matching_word, "<br>Item:", `Item`),
type = "scatter",
mode = "markers",
marker = list(size = 8, opacity = 0.7)) %>%
layout(title = "Calories vs. Carbohydrates for Items Containing Top 10 Words",
xaxis = list(title = "Carbohydrates (g)"),
yaxis = list(title = "Calories"),
legend = list(title = list(text = "Category")),
hovermode = "compare") # Enables comparing hover tooltips
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
plot_ly Boxplots# Convert data to long format for boxplot visualization
sb_nutr_long <- sb_nutr_filtered |>
select(matching_word, Calories, `Fat (g)`, `Carb. (g)`, `Fiber (g)`, `Protein (g)`) |> # Select only nutrition columns
pivot_longer(cols = Calories:`Protein (g)`, # Convert all nutrient columns into long format
names_to = "Nutrient",
values_to = "Value")
plot_ly() |>
# Boxplot for Calories (Primary Y-Axis)
add_trace(data = sb_nutr_long |> filter(Nutrient == "Calories"),
x = ~matching_word,
y = ~Value,
type = "box",
name = "Calories",
marker = list(color = "red"),
yaxis = "y1") |>
# Boxplots for Other Nutrients (Secondary Y-Axis)
add_trace(data = sb_nutr_long |> filter(Nutrient != "Calories"),
x = ~matching_word,
y = ~Value,
type = "box",
name = ~Nutrient,
color = ~Nutrient,
yaxis = "y2") |>
layout(title = "Nutritional Distribution of Top 10 Starbucks Menu Words",
xaxis = list(title = "Top 10 Words in Menu Items"),
# Primary Y-Axis (Calories)
yaxis = list(title = "Calories",
side = "left",
showgrid = FALSE),
# Secondary Y-Axis (Other Nutrients)
yaxis2 = list(title = "Other Nutrients (g)",
overlaying = "y",
side = "right",
showgrid = FALSE),
boxmode = "group")
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
# Create a 3D scatterplot
plot_ly(data = sb_nutr_filtered,
x = ~`Carb. (g)`,
y = ~Calories,
z = ~`Protein (g)`,
color = ~matching_word, # Color by top word
type = "scatter3d",
mode = "markers",
marker = list(size = 5, opacity = 0.6)) %>%
layout(title = "3D Scatterplot of Calories, Carbs, and Protein for Top 10 Words",
scene = list(
xaxis = list(title = "Carbohydrates (g)"),
yaxis = list(title = "Calories"),
zaxis = list(title = "Protein (g)")
))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
plot_ly Map# Create hover text
# Create hover text
sb_locs_state$hover <- with(sb_locs_state, paste("Number of Starbucks: ", store_count, '<br>', "State: ", state, '<br>', "Population: ", population))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('steelblue')
)
# Make sure both maps are on the same color scale
shadeLimit <- 125
# Create the map
# Create a map for the number of stores
map1 <- plot_geo(sb_locs_state, locationmode = 'USA-states') %>%
add_trace(
z = ~store_count,
locations = ~`State/Province`,
color = ~store_count,
colors = 'Reds',
hoverinfo = 'text',
text = ~hover
) %>%
layout(
title = "Number of Starbucks Stores per State",
geo = set_map_details
)
# Create a map for the population
map2 <- plot_geo(sb_locs_state, locationmode = 'USA-states') %>%
add_trace(
z = ~population,
locations = ~`State/Province`,
color = ~population,
colors = 'Blues',
hoverinfo = 'text',
text = ~hover
) %>%
layout(
title = "Population per State",
geo = set_map_details
)
# Place maps side by side
subplot(map1, map2, nrows = 1, shareX = TRUE, shareY = TRUE) %>%
layout(
title = "Starbucks Stores and Population by State"
)
## Warning: Ignoring 4 observations